package org.twelve.msll.parser;

import org.twelve.msll.grammar.Grammars;
import org.twelve.msll.grammarsymbol.NonTerminal;
import org.twelve.msll.grammarsymbol.NonTerminals;
import org.twelve.msll.grammarsymbol.Terminals;
import org.twelve.msll.lexer.Token;
import org.twelve.msll.parsetree.NonTerminalNode;
import org.twelve.msll.parsetree.ParseNode;
import org.twelve.msll.parsetree.ParserTree;
import org.twelve.msll.parsetree.TerminalNode;
import org.twelve.msll.util.Constants;

import java.io.Reader;

import static org.twelve.msll.util.Tool.cast;

/**
 * The MSLL parser's grammar is represented in the G4 format because CFG (Context-Free Grammar) format is often too
 * complex for language designers to work with directly.
 *
 * In this process, the MSLL parser uses the CFG format internally to parse a self-designed language. However, the G4 parser
 * acts as an intermediary, interpreting the G4 format (a higher-level grammar representation) and converting it into the
 * CFG format, which is then used by the MSLL parser for parsing.
 *
 * Essentially, this is a "language's language"—the G4 format itself is a language, represented by its own CFG grammar.
 * The G4 parser translates the G4 format into the CFG format so that the MSLL parser can interpret and process custom language definitions.
 * G4 parser also use msll theory to parse g4 file
 *
 * @author huizi 2024
 * @param <P> The type of Parse Tree generated by this parser.
 */
public abstract class G4Parser<P extends ParserTree> extends MsllParser<P> {

    /**
     * Constructor for initializing the G4Parser with grammar rules and a prediction table.
     *
     * @param grammars      The grammar rules that define the G4 parser grammar.
     * @param predictTable  The prediction table used for token matching during parsing.
     * @param nonTerminals  The set of non-terminal symbols in the grammar.
     * @param terminals     The set of terminal symbols used in the parsing process.
     * @param reader        The input source to be parsed (usually a G4 parser/lexer grammar file).
     */
    public G4Parser(Grammars grammars, PredictTable predictTable, NonTerminals nonTerminals, Terminals terminals, Reader reader) {
        super(grammars, predictTable, nonTerminals, terminals, reader);
    }

    /**
     * Optimizes the raw parse tree generated after parsing the G4 file by applying a series of node transformations.
     *
     * The method recursively processes each non-terminal node in the parse tree and applies various optimizations, such as:
     * - Merging productions.
     * - Optimizing single-child nodes.
     * - Optimizing factors (e.g., repetitions).
     * - Handling special cases like zero or one occurrence, zero or more occurrences, and one or more occurrences.
     *
     * @param node The current non-terminal node being processed.
     * @param root The root of the parse tree.
     */
    protected void optimizeNodes(NonTerminalNode node, NonTerminalNode root) {
        NonTerminalNode parent = node.parent();
        for (ParseNode n : node.nodes()) {
            if (n instanceof NonTerminalNode) {
                NonTerminalNode non = cast(n);
                fixedNonTerminal(n, non);
                optimizeNodes(non, root);// Recursive optimization for non-terminal nodes
            } else {
                processTerminal(cast(n));// Process terminal nodes
            }
        }
        //merge  production
        mergeProduction(node, parent);
        //optimize one child node
        cutOneChildNode(node, parent);
    }

    /**
     * Marks a non-terminal node as fixed, preventing it from being optimized even if it has only one child.
     *
     * This method is used to identify certain non-terminal nodes that should not be optimized, specifically when they
     * follow the pattern "<non-terminal> -> production".
     *
     * @param n   The current parse node being evaluated.
     * @param non The non-terminal node that may be marked as fixed.
     */
    private void fixedNonTerminal(ParseNode n, NonTerminalNode non) {
        if (n.name().equals(Constants.NON_TERMINAL) && non.nodes().get(0).symbol().type() == terminals.fromName(Constants.LESS_STR)) {
            non.removeNode(non.nodes().get(0));
            non.removeNode(non.nodes().get(1));
            non.setTag(Constants.FIX, true);
        }
    }

    /**
     * Optimizes nodes with only one child by replacing the node with its child, effectively flattening the parse tree.
     *
     * Nodes with only one child are unnecessary in many cases, and this method removes such nodes to simplify the
     * tree structure. Certain key nodes (like grammar or production nodes) are exempt from this optimization.
     *
     * @param node   The node being processed for potential flattening.
     * @param parent The parent of the current node.
     */
    protected static void cutOneChildNode(NonTerminalNode node, NonTerminalNode parent) {
        if (node.parent() != null && node.nodes().size() == 1) {
            if (node.name().equals(Constants.GRAMMARS)
                    || node.name().equals(Constants.PRODUCTIONS)
                    || node.name().equals(Constants.PRODUCTION)
                    || node.name().equals(Constants.NON_TERMINAL)
                    || node.name().equals(Constants.TERMINAL)) return;
            int index = parent.removeNode(node);
            parent.addNodes(node.nodes(), index);
        }
    }

    /**
     * Merges redundant or nested production nodes to simplify the parse tree.
     *
     * CFG parsers may generate deep, nested structures that result in redundant production nodes.
     * This method merges such nodes into a more concise representation, reducing tree depth and improving clarity.
     *
     * @param node   The current production node being processed.
     * @param parent The parent of the current production node.
     */
    protected static void mergeProduction(NonTerminalNode node, NonTerminalNode parent) {
        if (node.name().equals(Constants.PRODUCTION) && !parent.name().equals(Constants.PRODUCTIONS)) {
            int index = parent.removeNode(node);
            parent.addNodes(node.nodes(), index);
        }
    }

    /**
     * Processes terminal nodes by cleaning or modifying token values based on the terminal type.
     *
     * - For regex terminals, the surrounding quotes are removed.
     * - For string terminals, single or double quotes are removed.
     * - If the terminal is "OR", it is removed from the parse tree entirely.
     *
     * @param node The terminal node being processed.
     */
    private void processTerminal(TerminalNode node) {
        Token token = node.token();
        if (node.name().equals(Constants.REGEX)) {
            node.setToken(new Token(token.terminal(), token.lexeme().replaceAll("/\"|\"/", ""), token.location()));
        }

        if (node.name().equals(Constants.STRING)) {
            node.setToken(new Token(token.terminal(), token.lexeme().replaceAll("[\"']", ""), token.location()));
        }

        if (node.name().equals("OR")) {
            node.parent().removeNode(node);
        }
    }

    /**
     * Returns the head of the grammar, which defines how non-terminal symbols are recognized.
     *
     * This method is abstract and should be overridden by derived parsers to provide the appropriate
     * head node for different grammar types. For example:
     * - A G4 parser might recognize non-terminal symbols for grammar rules.
     * - A G4 lexer file might recognize terminal symbols for lexical rules.
     *
     * @return The non-terminal symbol that serves as the head of the grammar.
     */
    protected abstract NonTerminal getHead();

}
